We would like to know which Pokemon have similar species strengths (base stats) to one another; principal component analysis (PCA) can help us address this question.
The following files are available on https://github.com/v0369012/Pokemon_PCA.
# Load the Pokemon value table (1st ~ 7th generation) as one string per line of the file
PKM_values_7 <- readLines(con = "Pokemon_list_g7_TC.txt", encoding = "UTF-8")
To simplify the analysis, we removed Pokemon with special forms, such as Mega or Alolan forms.
# Loading packages
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 1.0.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Lines for Pokemon without a special form contain exactly 8 "|" separators
# (i.e. 9 "|"-delimited fields); flag those lines
PKM_without_spf <- PKM_values_7 %>% str_count(pattern = "\\|") == 8
# Keep only the flagged lines
PKM_values_7_without_spf <- PKM_values_7[PKM_without_spf]
# Break every retained line into its "|"-delimited fields (a list of character vectors)
PKM_values_7_without_spf_split <- PKM_values_7_without_spf %>%
  str_split(pattern = "\\|")
# Make a Pokemon table
# Every retained line was split into exactly 9 fields (8 "|" separators), so the
# pieces are stacked once into an n x 9 character matrix and the columns are
# read off by index: field 1 is unused, fields 2-9 are Number, Name and the six
# base stats.
# Generation is assigned purely by row position; the cut-offs below are the
# cumulative row counts at the end of generations 1-7 in the filtered table.
generation_last_row <- c(0, 151, 251, 385, 490, 640, 707, 792)
PKM_generation <- rep(c(1, 2, 3, 4, 5, 6, 7), times = diff(generation_last_row))
# Fail loudly if the input no longer has the expected number of rows, instead of
# silently producing NA-padded or truncated columns.
stopifnot(length(PKM_values_7_without_spf_split) == length(PKM_generation))
PKM_fields <- do.call(rbind, PKM_values_7_without_spf_split)
PKM_values_7_without_spf_df <- data.frame(
  Number = PKM_fields[, 2],
  Name = PKM_fields[, 3],
  generation = PKM_generation,
  HP = as.numeric(PKM_fields[, 4]),
  ATK = as.numeric(PKM_fields[, 5]),
  DEF = as.numeric(PKM_fields[, 6]),
  SATK = as.numeric(PKM_fields[, 7]),
  SDEF = as.numeric(PKM_fields[, 8]),
  # Any "}" characters in the last field are stripped before numeric conversion
  SPEED = PKM_fields[, 9] %>% str_replace_all("\\}", "") %>% as.numeric()
)
# Check the table
head(PKM_values_7_without_spf_df)
## Number Name generation HP ATK DEF SATK SDEF SPEED
## 1 001 妙蛙種子 1 45 49 49 65 65 45
## 2 002 妙蛙草 1 60 62 63 80 80 60
## 3 003 妙蛙花 1 80 82 83 100 100 80
## 4 004 小火龍 1 39 52 43 60 50 65
## 5 005 火恐龍 1 58 64 58 80 65 80
## 6 006 噴火龍 1 78 84 78 109 85 100
# Address Pokemon types table
PKM_types_7 <- readLines("Pokemon_types.txt")
# Split every line on "|" once and reuse the result for all three fields,
# covering however many lines the file actually contains.
PKM_types_7_split <- str_split(PKM_types_7, "\\|")
# Extract the Pokemon number (3rd field of each line)
PKM_types_7_number <- vapply(
  PKM_types_7_split,
  function(fields) fields[[3]],
  character(1)
)
# Extract the Pokemon names (4th field of each line)
PKM_types_7_name <- vapply(
  PKM_types_7_split,
  function(fields) fields[[4]],
  character(1)
)
# We chose Pokemon first types for the analysis (6th field, with any "}" removed)
PKM_types_7_types1 <- vapply(
  PKM_types_7_split,
  function(fields) fields[[6]],
  character(1)
) %>%
  str_remove_all("\\}")
# Assemble the number, name and first-type vectors into one table
PKM_types_7_df <- as.data.frame(
  list(
    Number = PKM_types_7_number,
    Name = PKM_types_7_name,
    types1 = PKM_types_7_types1
  )
)
# Preview the first rows of the table
head(PKM_types_7_df, n = 6)
## Number Name types1
## 1 001 Bulbasaur Grass
## 2 002 Ivysaur Grass
## 3 003 Venusaur Grass
## 4 004 Charmander Fire
## 5 005 Charmeleon Fire
## 6 006 Charizard Fire
# Flag the Pokemon numbers made up of digits only (no letters)
position_without_letters <- str_detect(
  string = PKM_types_7_number,
  pattern = "^[0-9]*$"
)
PKM_types_7_number_without_letters <- PKM_types_7_number[position_without_letters]
# Keep only the rows of the types table whose Number is digits-only
PKM_types_7_df_t <- PKM_types_7_df %>%
  dplyr::filter(Number %in% PKM_types_7_number_without_letters)
# Join the types table (x) with the value table (y) on their shared Number
# column; the clashing Name columns become Name.x and Name.y
PKM_merged_df <- merge(
  x = PKM_types_7_df_t,
  y = PKM_values_7_without_spf_df,
  by = "Number"
)
# Check the correlation between the variables
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Pairwise correlations of columns 5..end of the merged table (generation and
# the six stats), reshaped to long format; show the first rows
PKM_merged_df[, 5:ncol(PKM_merged_df)] %>%
  cor() %>%
  melt() %>%
  head()
## Var1 Var2 value
## 1 generation generation 1.00000000
## 2 HP generation 0.06737277
## 3 ATK generation 0.12726340
## 4 DEF generation 0.09396861
## 5 SATK generation 0.09037036
## 6 SDEF generation 0.06661734
# Heatmap of the correlation matrix: one tile per variable pair, filled on a
# diverging scale centred at 0
PKM_merged_df[, 5:ncol(PKM_merged_df)] %>%
  cor() %>%
  melt() %>%
  ggplot(aes(x = Var1, y = Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient2(
    low = "firebrick4",
    mid = "white",
    high = "steelblue",
    midpoint = 0
  ) +
  guides(fill = guide_legend(title = "Correlation")) +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
  )
# PCA
# Input is columns 5..end of the merged table (generation plus the six stats).
# The second positional argument of prcomp() is `retx`, not `scale.`, so it is
# named explicitly here: the rotated scores are kept in pca.model$x, and the
# variables are centred (default) but NOT scaled to unit variance (default
# scale. = FALSE).
# NOTE(review): if standardising the variables was intended, pass
# `scale. = TRUE` — that would change every result reported below.
pca.model <- prcomp(PKM_merged_df[, 5:ncol(PKM_merged_df)], retx = TRUE)
# Show pca summary
summary(pca.model)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 45.3407 30.5622 26.2864 22.8902 18.60125 14.52452
## Proportion of Variance 0.4314 0.1960 0.1450 0.1100 0.07261 0.04427
## Cumulative Proportion 0.4314 0.6274 0.7724 0.8823 0.95496 0.99922
## PC7
## Standard deviation 1.92186
## Proportion of Variance 0.00078
## Cumulative Proportion 1.00000
# Loadings: one row per input variable, one column per principal component
pca.model[["rotation"]]
## PC1 PC2 PC3 PC4 PC5
## generation 0.005173058 0.003138396 -0.002254479 0.001608106 0.007858027
## HP 0.365911209 -0.014089097 -0.126630236 0.795808412 -0.333377213
## ATK 0.464159122 0.093172879 -0.714863910 -0.058629494 0.166243167
## DEF 0.419466963 0.647724808 0.047193615 -0.369365737 0.034422515
## SATK 0.463676640 -0.390824439 0.339767984 0.089206914 0.706333101
## SDEF 0.420806046 0.127751437 0.577626506 -0.039023568 -0.399765049
## SPEED 0.287423468 -0.634427982 -0.146994523 -0.466194387 -0.448617477
## PC6 PC7
## generation 0.005800753 0.9999301595
## HP -0.324688598 0.0010893090
## ATK 0.483426571 -0.0083220613
## DEF -0.514447494 -0.0007887328
## SATK -0.099614896 -0.0055224474
## SDEF 0.558222907 -0.0013096278
## SPEED -0.273221981 0.0060330843
# Keep the scores on the first two principal components as a data frame
p1_p2_table <- as.data.frame(pca.model$x[, 1:2])
# Label each row with the first column of the merged table (the merge key, Number)
rownames(p1_p2_table) <- PKM_merged_df[[1]]
# Pokemon numbers to flag as "legend" Pokemon in the plots
# NOTE(review): presumably the legendary Pokemon of each generation — confirm
# the ranges against the data source.
legend_number <- c(
  144:146, 150:151,
  243:245, 249:251,
  377:386,
  479:494,
  638:649,
  716:721,
  785:809
)
# TRUE for every merged row whose Number is in the list. %in% compares the
# values via their character form, which works here because every listed
# number already has three digits (e.g. "144").
legend <- PKM_merged_df[["Number"]] %in% legend_number
# Row indices of the flagged Pokemon
legend_position <- which(legend)
# Attach the identifying and grouping columns to the PC1/PC2 scores
p1_p2_table_t <- cbind(
  p1_p2_table,
  Number = PKM_merged_df[["Number"]],
  Name = PKM_merged_df[["Name.y"]],
  generation = PKM_merged_df[["generation"]],
  types1 = PKM_merged_df[["types1"]],
  legend = legend
)
# Store generation as text rather than as a number
p1_p2_table_t[["generation"]] <- as.character(p1_p2_table_t[["generation"]])
# Visualization: PC1 vs PC2 scatter plots; Name is mapped to the `label`
# aesthetic in every plot
pca_gg <- ggplot(
  p1_p2_table_t,
  aes(x = PC1, y = PC2, label = Name)
) +
  geom_point(size = 2.5)
# Same scatter, coloured by generation
pca_gg_generation <- ggplot(
  p1_p2_table_t,
  aes(x = PC1, y = PC2, label = Name, colour = generation)
) +
  geom_point(size = 2.5)
# Same scatter, coloured by first type
pca_gg_types1 <- ggplot(
  p1_p2_table_t,
  aes(x = PC1, y = PC2, label = Name, colour = types1)
) +
  geom_point(size = 2.5)
# Same scatter, coloured by the legend flag
pca_gg_legend <- ggplot(
  p1_p2_table_t,
  aes(x = PC1, y = PC2, label = Name, colour = legend)
) +
  geom_point(size = 2.5)
# User-interactive visualization
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Convert each ggplot into an interactive plotly widget; wrapping the
# assignment in parentheses also prints the widget
(pca_ggly <- ggplotly(pca_gg))
(pca_ggly_generation <- ggplotly(pca_gg_generation))
(pca_ggly_types1 <- ggplotly(pca_gg_types1))
(pca_ggly_legend <- ggplotly(pca_gg_legend))